Fork me on GitHub

hive自定义UDTF函数

自定义UDTF函数

概述

用户自定义表生成函数(UDTF)接受零个或多个输入,然后产生多列或多行的输出。要实现UDTF,需要继承org.apache.hadoop.hive.ql.udf.generic.GenericUDTF,同时实现三个方法。

参考文章

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
https://javinjunfeng.top/technicalstack/hive/77

案例

原始数据和转换后的数据

原始数据

sku,category_name,busi_time,shop_id,flag,sale_num,income
21A5+32B2,烟灶,2018-06-29 00:00:00,21,0,6,16056.0

转换后的数据

rela_sku,rule_id,sku,category_name,busi_time,shop_id,sale_num,income
21A5,0,21A5+32B2,烟机,2018-06-29 00:00:00,21,6,16056.0
32B2,1,21A5+32B2,灶具,2018-06-29 00:00:00,21,6,16056.0

创建自定义函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
package com.xh.spark.sql.hive.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;
/**
 * Hive UDTF that explodes a combo-sale SKU string (single SKUs joined by '+')
 * into one output row per related SKU, tagged with a reduction-rule id.
 *
 * <p>Input: a single argument, an array of structs; only the first struct is
 * read, with fields (by position): sku string, category name, flag.
 * Output schema: two string columns, {@code rela_sku} and {@code rule_id}.
 */
public class JDQuotaReduction extends GenericUDTF {

    // rule_id values emitted below (kept as strings to match the declared schema)
    private static final String RULE_SECOND_ITEM = "1";
    private static final String RULE_FIRST_ITEM_STANDALONE = "2";

    /**
     * Declares the output schema: (rela_sku string, rule_id string).
     *
     * @param args inspector for the UDTF's arguments (unused here)
     * @return struct inspector describing the two string output columns
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector args) throws UDFArgumentException {
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldTypes = new ArrayList<>();
        fieldNames.add("rela_sku");
        fieldTypes.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("rule_id");
        fieldTypes.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldTypes);
    }

    /**
     * Splits the combo SKU of the first struct in the input array and forwards
     * one (rela_sku, rule_id) row per related SKU, according to the category.
     *
     * @param record UDTF arguments; record[0] is expected to be a list whose
     *               first element is a list of (sku, category, flag) values
     *               — NOTE(review): this shape is assumed from the example
     *               query, confirm against actual callers.
     */
    @Override
    public void process(Object[] record) throws HiveException {
        List<Object> allQuota = (List<Object>) record[0];
        List<Object> first = (List<Object>) allQuota.get(0);
        String[] sku = first.get(0).toString().split("\\+");
        String cate = first.get(1).toString();
        String flag = first.get(2).toString();

        // "烟灶" combo (hood + hob): exactly two items, rule ids 0 and 1.
        if ("烟灶".equals(cate)) {
            forwardRow(sku[0], "0");
            forwardRow(sku[1], RULE_SECOND_ITEM);
            return;
        }
        // "嵌入式" (built-in) combo: first item rule 2, the rest rule 1.
        if ("嵌入式".equals(cate)) {
            forwardRow(sku[0], RULE_FIRST_ITEM_STANDALONE);
            for (int i = 1; i < sku.length; i++) {
                forwardRow(sku[i], RULE_SECOND_ITEM);
            }
            return;
        }
        // "烟灶消" (hood + hob + sterilizer) combo and large suites.
        if ("烟灶消".equals(cate) || "大套系".equals(cate)) {
            // Second item onwards: rule 1.
            for (int i = 1; i < sku.length; i++) {
                forwardRow(sku[i], RULE_SECOND_ITEM);
            }
            if ("1".equals(flag)) {
                // First two items together form a "烟灶" sub-combo, rule 3.
                // BUG FIX: was `result[1] = 3` (an Integer), but the column is
                // declared with javaStringObjectInspector — must be a String.
                forwardRow(sku[0] + "+" + sku[1], "3");
            } else {
                forwardRow(sku[0], RULE_FIRST_ITEM_STANDALONE);
            }
        }
    }

    /** Emits one (rela_sku, rule_id) output row. */
    private void forwardRow(String relaSku, String ruleId) throws HiveException {
        forward(new Object[] {relaSku, ruleId});
    }

    @Override
    public void close() throws HiveException {
        // No per-instance resources to release.
    }
}

创建表

1
2
3
4
5
6
7
8
9
10
11
12
13
-- Source table: one row per combo-sale record, comma-delimited text file.
CREATE TABLE IF NOT EXISTS transct_quota (
    sku           STRING,
    category_name STRING,
    busi_time     STRING,
    shop_id       STRING,
    flag          STRING,
    sale_num      INT,
    income        DOUBLE
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

插入数据

1
-- Sample row: a "烟灶" (hood + hob) combo sale of two SKUs joined by '+'.
insert into transct_quota values ("21A5+32B2","烟灶","2018-06-29 00:00:00","21","0",6,16056.0);

在hive命令窗口添加函数jar包

1
-- Register the jar containing the UDTF class with the Hive session.
add jar /home/hadoop/sparklearning-1.0-SNAPSHOT.jar;

在hive命令窗口创建临时函数

1
-- Bind a session-scoped function name to the UDTF implementation class.
create temporary function combine_sku_reduction_rule as 'com.xh.spark.sql.hive.udtf.JDQuotaReduction';

使用自定义函数

1
-- Explode each combo row into (rela_sku, rule_id) pairs via the UDTF.
-- Fixes vs. the original: the table's column is `sku` (there is no
-- `goods_sku_id`), and no alias `jd_quota` exists in the statement — the
-- base table's columns are projected via `transct_quota.*` instead.
select c1, c2, transct_quota.* from transct_quota LATERAL VIEW combine_sku_reduction_rule(array(named_struct('sku', sku, 'cate', category_name, 'flag', flag))) sku_tab as c1, c2;

本文标题:hive自定义UDTF函数

文章作者:tang

发布时间:2019年04月24日 - 14:04

最后更新:2019年04月24日 - 14:04

原始链接：https://tgluon.github.io/2019/04/24/hive自定义UDTF函数/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。

-------------本文结束感谢您的阅读-------------